
********************************************************************************
*
* This program creates an input file of moves that are used to find the 
* largest strongly connected set of establishments. 
*  
* The main input of this file is movers_ver$ver.dta which is a person-establishment
* level file recording all moves. The hockey stick functions are used to weight
* moves by the probability that they are endogenous. The file output is 
* analyzed using the BGL functions in Matlab to extract
* the largest strongly connected set.
********************************************************************************



*-------------------------------------------------------------------------------
*Part1: restrict the movers file to the moves over which we would like to find
*the largest strongly connected set
*-------------------------------------------------------------------------------
use "$temp/movers_ver$ver.dta", clear

* One row per move: sender (currentid), receiver (toid) and period (year, q)
keep currentid toid year q

* The rates file is keyed on betnr, so the sender id is renamed for the merge
rename currentid betnr

*Merge with ee, en rates by sending establishment, year and q
merge m:1 betnr year q using "$results/estab_eeenrates_byyr_ver$ver.dta"
tab _merge

replace _merge=3 if betnr==1 // non-employment (betnr==1) does not have estab stats
tab _merge
drop if _merge == 1 // no estimated eeenrates_byyr: either a small establishment that we dropped or a singleton obs (birth and death or dropped)
tab _merge
keep if _merge==3 // drops _merge==2: establishments without observed flow, rate imputed
drop _merge

rename betnr currentid

* Collect the receivers of moves whose sender is currentid==1 (non-employment).
* A tempfile is used instead of a fixed "$temp/temp.dta" so that concurrent runs
* sharing $temp cannot overwrite each other's scratch file and nothing is left
* behind if the script stops early.
preserve
	keep if currentid==1
	keep toid
	rename toid currentid
	duplicates drop
	tempfile ne_receivers
	save `ne_receivers', replace
restore

* Keep only senders that appear in that receiver list; rows sent from
* currentid==1 are always retained.
merge m:1 currentid using `ne_receivers'
replace _merge = 3 if currentid==1
tab _merge
keep if _merge == 3
drop _merge

* Every remaining establishment sender must carry both rates
assert !missing(eerate_avgall) if currentid != 1
assert !missing(enrate_avgall) if currentid != 1


keep currentid toid year q
save "$temp/str_conn_input_ver$ver.dta", replace

*-------------------------------------------------------------------------------
*Part2: prepare the set of establishments to determine the largest strongly 
*connected set
*-------------------------------------------------------------------------------

*Create id's corresponding to matrix entries for adjacency matrix.
*Senders and receivers are stacked into one column (a), de-duplicated, sorted,
*and numbered 1..N; that running number is the row/column index of the
*establishment in the adjacency matrix.
rename currentid a1
rename toid a2
gen double i = _n
reshape long a, i(i) j(j)
keep a
duplicates drop 
gen double estabid = a
sort estabid
* Explicit long: the default float type cannot represent consecutive integers
* above 16,777,216, so ids would silently collide with enough establishments.
gen long matrix_estabid = _n
count
compress
* The same id list is written twice, under the working name and the _orig name
save "$temp/estab_matrix_ids_ver$ver.dta", replace
save "$temp/estab_matrix_ids_orig_ver$ver.dta", replace

*Re-input the trimmed movers file
*(paths are quoted so that a $temp containing spaces still works)
use "$temp/str_conn_input_ver$ver.dta", clear

*Get sender matrix id
*keep(match) discards _merge==2, i.e. establishments in the id list that never
*appear as a sender (they only appear as a toid). Only matrix_estabid is
*brought in; the helper column a from the id file is not needed.
rename currentid estabid
merge m:1 estabid using "$temp/estab_matrix_ids_ver$ver.dta", keep(match) nogenerate keepusing(matrix_estabid)

*Obtain the list of senders that meet all the restrictions from part 1
preserve
	keep estabid matrix_estabid
	duplicates drop
	count
	save "$temp/restricted_senders_ver$ver.dta", replace
restore

rename estabid currentid
rename matrix_estabid matrix_currentid

*Get receiver matrix id 
rename toid estabid
merge m:1 estabid using "$temp/estab_matrix_ids_ver$ver.dta", keep(match) nogenerate keepusing(matrix_estabid)

*Throw out receivers that are not in the sender list
*Note: this step is crucial as it makes the flows matrix square!
*matrix_estabid already exists in memory, so this merge only acts as a filter.
merge m:1 estabid using "$temp/restricted_senders_ver$ver.dta"
tab _merge
*_merge==1 is a receiving establishment that is not in the restricted sender list
*_merge==2 is an estab in the restricted sender list that is not a receiver
keep if _merge==3
drop _merge
rename estabid toid
rename matrix_estabid matrix_toid
keep matrix*

*Flows
*Each remaining row is one move, so it contributes one unit to its
*sender-receiver cell.
gen flows = 1 //not actual flows for NE 

*Make matrix square
*The largest sender id and the largest receiver id can differ. Zero-flow
*diagonal entries are appended for every id between the smaller and the larger
*maximum, so both dimensions end at the same id without changing any flow.
*Padding starts at min+1 (not at mxto+1), so no id above the larger maximum is
*ever created.
summarize matrix_currentid
local mxcurr=r(max)
summarize matrix_toid
local mxto=r(max)
local lo = min(`mxcurr', `mxto')
local obstoadd = abs(`mxcurr'-`mxto')

*Nothing to pad when the maxima already agree (or the data are empty)
if !missing(`obstoadd') & `obstoadd' > 0 {
	preserve
		clear
		set obs `obstoadd'
		* long keeps large ids exact; float would round above 16,777,216
		gen long matrix_currentid = `lo' + _n
		gen long matrix_toid = matrix_currentid
		gen flows=0
		tempfile squareobs
		save `squareobs', replace
	restore

	append using `squareobs'
}


*Collapse by sender-receiver pairs
collapse (sum) flows, by(matrix_currentid matrix_toid)
label var flows "flows"

compress

*Forward slashes work on every platform Stata runs on; backslashes are Windows-only
export delimited using "$matlab/sconn_idfy/flows_ver$ver.csv", datafmt   replace

